df <- read.csv("/Users/dshukla/MS/Regression/project/car_dekho.csv")
library(stringr)
library(ggplot2)
library(gridExtra)
library(grid)
library(car)
## Loading required package: carData
head(df); str(df)
## 'data.frame': 8128 obs. of 13 variables:
## $ name : chr "Maruti Swift Dzire VDI" "Skoda Rapid 1.5 TDI Ambition" "Honda City 2017-2020 EXi" "Hyundai i20 Sportz Diesel" ...
## $ year : int 2014 2014 2006 2010 2007 2017 2007 2001 2011 2013 ...
## $ selling_price: int 450000 370000 158000 225000 130000 440000 96000 45000 350000 200000 ...
## $ km_driven : int 145500 120000 140000 127000 120000 45000 175000 5000 90000 169000 ...
## $ fuel : chr "Diesel" "Diesel" "Petrol" "Diesel" ...
## $ seller_type : chr "Individual" "Individual" "Individual" "Individual" ...
## $ transmission : chr "Manual" "Manual" "Manual" "Manual" ...
## $ owner : chr "First Owner" "Second Owner" "Third Owner" "First Owner" ...
## $ mileage : chr "23.4 kmpl" "21.14 kmpl" "17.7 kmpl" "23.0 kmpl" ...
## $ engine : chr "1248 CC" "1498 CC" "1497 CC" "1396 CC" ...
## $ max_power : chr "74 bhp" "103.52 bhp" "78 bhp" "90 bhp" ...
## $ torque : chr "190Nm@ 2000rpm" "250Nm@ 1500-2500rpm" "12.7@ 2,700(kgm@ rpm)" "22.4 kgm at 1750-2750rpm" ...
## $ seats : int 5 5 5 5 5 5 5 4 5 5 ...
library(tidyr)
df$engine_cc <- as.numeric(str_split_fixed(df$engine, " ", 2)[,1])
df$max_power_bhp <- as.numeric(str_split_fixed(df$max_power, " ", 2)[,1])
df$mileage_kmpl <- as.numeric(str_split_fixed(df$mileage, " ", 2)[,1])
df$brand <- str_split_fixed(df$name, " ", 2)[,1]
df$age= 2021- as.numeric(df$year)
variables <- c("age","brand","selling_price","km_driven","fuel","seller_type","transmission","owner","seats","engine_cc","max_power_bhp","mileage_kmpl")
df_clean <- df[,variables]
str(df_clean)
## 'data.frame': 8128 obs. of 12 variables:
## $ age : num 7 7 15 11 14 4 14 20 10 8 ...
## $ brand : chr "Maruti" "Skoda" "Honda" "Hyundai" ...
## $ selling_price: int 450000 370000 158000 225000 130000 440000 96000 45000 350000 200000 ...
## $ km_driven : int 145500 120000 140000 127000 120000 45000 175000 5000 90000 169000 ...
## $ fuel : chr "Diesel" "Diesel" "Petrol" "Diesel" ...
## $ seller_type : chr "Individual" "Individual" "Individual" "Individual" ...
## $ transmission : chr "Manual" "Manual" "Manual" "Manual" ...
## $ owner : chr "First Owner" "Second Owner" "Third Owner" "First Owner" ...
## $ seats : int 5 5 5 5 5 5 5 4 5 5 ...
## $ engine_cc : num 1248 1498 1497 1396 1298 ...
## $ max_power_bhp: num 74 103.5 78 90 88.2 ...
## $ mileage_kmpl : num 23.4 21.1 17.7 23 16.1 ...
num_cols <- df_clean[,sapply(df_clean,is.numeric)]
summary(num_cols)
## age selling_price km_driven seats
## Min. : 1.000 Min. : 29999 Min. : 1 Min. : 2.000
## 1st Qu.: 4.000 1st Qu.: 254999 1st Qu.: 35000 1st Qu.: 5.000
## Median : 6.000 Median : 450000 Median : 60000 Median : 5.000
## Mean : 7.196 Mean : 638272 Mean : 69820 Mean : 5.417
## 3rd Qu.:10.000 3rd Qu.: 675000 3rd Qu.: 98000 3rd Qu.: 5.000
## Max. :38.000 Max. :10000000 Max. :2360457 Max. :14.000
## NA's :221
## engine_cc max_power_bhp mileage_kmpl
## Min. : 624 Min. : 0.00 Min. : 0.00
## 1st Qu.:1197 1st Qu.: 68.05 1st Qu.:16.78
## Median :1248 Median : 82.00 Median :19.30
## Mean :1459 Mean : 91.52 Mean :19.42
## 3rd Qu.:1582 3rd Qu.:102.00 3rd Qu.:22.32
## Max. :3604 Max. :400.00 Max. :42.00
## NA's :221 NA's :216 NA's :221
na.val <- lapply(df_clean,function(x) mean(is.na(x)) * 100)
na.df = data.frame( variable = names(na.val),
percentage.na =round(as.numeric( sapply(na.val, "[", 1) ),2))
print(na.df)
## variable percentage.na
## 1 age 0.00
## 2 brand 0.00
## 3 selling_price 0.00
## 4 km_driven 0.00
## 5 fuel 0.00
## 6 seller_type 0.00
## 7 transmission 0.00
## 8 owner 0.00
## 9 seats 2.72
## 10 engine_cc 2.72
## 11 max_power_bhp 2.66
## 12 mileage_kmpl 2.72
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
imp_brand <- df_clean %>% group_by(brand)%>% count() %>%arrange(desc(n)) %>% filter(n > 50)
df_model_clean <- df_clean[df_clean$brand %in% unique(imp_brand$brand),]
dim(df_model_clean)
## [1] 7928 12
df_model <- df_model_clean[!is.na(df_model_clean$seats) & !is.na(df_model_clean$max_power_bhp),]
dim(df_model)
## [1] 7713 12
check <- df_model_clean[is.na(df_model_clean$seats) & is.na(df_model_clean$max_power_bhp),]
head(check)
str(df_model_clean)
## 'data.frame': 7928 obs. of 12 variables:
## $ age : num 7 7 15 11 14 4 14 20 10 8 ...
## $ brand : chr "Maruti" "Skoda" "Honda" "Hyundai" ...
## $ selling_price: int 450000 370000 158000 225000 130000 440000 96000 45000 350000 200000 ...
## $ km_driven : int 145500 120000 140000 127000 120000 45000 175000 5000 90000 169000 ...
## $ fuel : chr "Diesel" "Diesel" "Petrol" "Diesel" ...
## $ seller_type : chr "Individual" "Individual" "Individual" "Individual" ...
## $ transmission : chr "Manual" "Manual" "Manual" "Manual" ...
## $ owner : chr "First Owner" "Second Owner" "Third Owner" "First Owner" ...
## $ seats : int 5 5 5 5 5 5 5 4 5 5 ...
## $ engine_cc : num 1248 1498 1497 1396 1298 ...
## $ max_power_bhp: num 74 103.5 78 90 88.2 ...
## $ mileage_kmpl : num 23.4 21.1 17.7 23 16.1 ...
cat_cols <- df_model[,sapply(df_model,is.character)]
lapply(cat_cols, function(x) table(x))
## $brand
## x
## BMW Chevrolet Datsun Ford Honda
## 118 230 65 388 466
## Hyundai Jaguar Mahindra Maruti Mercedes-Benz
## 1360 71 758 2367 54
## Nissan Renault Skoda Tata Toyota
## 81 228 104 719 452
## Volkswagen Volvo
## 185 67
##
## $fuel
## x
## CNG Diesel LPG Petrol
## 52 4171 35 3455
##
## $seller_type
## x
## Dealer Individual Trustmark Dealer
## 1031 6446 236
##
## $transmission
## x
## Automatic Manual
## 941 6772
##
## $owner
## x
## First Owner Fourth & Above Owner Second Owner
## 5069 159 1984
## Test Drive Car Third Owner
## 2 499
par(mfrow = c(1,2))
hist(df_model$selling_price, main= paste("selling_price"), breaks = 50,probability = TRUE)
hist(log(df_model$selling_price), main= paste("log of selling_price"), breaks = 50,probability = TRUE)
# library(hrbrthemes)
#
# for (feature in colnames(cat_cols)) {
#
# p <- (ggplot(data=df_model, aes(x= log(selling_price), group= df_model[,feature], fill=df_model[,feature])) +
# geom_density() +
# theme_ipsum())
#
# print(p + scale_fill_discrete(name = feature))
# }
p1 <- ggplot(df_model, aes(x=brand, y=log(selling_price), fill=brand)) +
geom_boxplot()+labs(title=paste0("Selling Price by brand"),x= 'brand', y = "Selling Price") + scale_fill_discrete(name = df_model$brand) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
p2 <- ggplot(df_model, aes(x=fuel, y=log(selling_price), fill=fuel)) +
geom_boxplot()+labs(title=paste0("Selling Price by fuel type"),x= 'fuel', y = "Selling Price") + scale_fill_discrete(name = df_model$fuel) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
p3 <- ggplot(df_model, aes(x=owner, y=log(selling_price), fill=owner)) +
geom_boxplot()+labs(title=paste0("Selling Price by owner"),x= 'owner', y = "Selling Price") + scale_fill_discrete(name = df_model$owner) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
p4 <- ggplot(df_model, aes(x=seller_type, y=log(selling_price), fill=seller_type)) +
geom_boxplot()+labs(title=paste0("Selling Price by seller_type"),x= 'seller_type', y = "Selling Price") + scale_fill_discrete(name = df_model$seller_type) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
p5 <- ggplot(df_model, aes(x=transmission, y=log(selling_price), fill=transmission)) +
geom_boxplot()+labs(title=paste0("Selling Price by transmission"),x= "transmission", y = "Selling Price") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
grid.arrange(p2,p3,p4,p5,ncol = 2)
library(dplyr)
df_model%>%
group_by(brand)%>%
count()%>%
arrange(desc(n))%>%
filter(n >90)%>%
ggplot()+geom_col(aes(x=n,y=reorder(brand,n),fill=brand),show.legend = FALSE)+
geom_label(aes(y = reorder(brand,n), x = n, label = paste(round((n/sum(n))*100,2),'%')))+
labs(title = 'Percentage share of Brands',
subtitle = '',
x= 'Percentage Share',
y='Company')
num_cols['log_sp'] <- log(num_cols$selling_price)
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:car':
##
## logit
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
pairs.panels(num_cols[,-1],
method = "pearson", # correlation method
hist.col = "#00AFBB",
density = T, # show density plots
ellipses = TRUE # show correlation ellipses
)
library(ggpubr)
p1 <- ggplot(df_model[df_model$km_driven < 5e5,], aes(x = km_driven, y = log(selling_price))) + geom_point(size=2, shape=23) + geom_smooth(method = "lm", se=FALSE, color="red", formula = y ~ x, size=1,fullrange=TRUE) + stat_cor(method = "pearson")
p2 <- ggplot(df_model, aes(x = age, y = log(selling_price))) + geom_point(size=2, shape=23) + geom_smooth(method = "lm", se=FALSE, color="red", formula = y ~ x, size=1,fullrange=TRUE) + stat_cor(method = "pearson")
p3 <- ggplot(df_model, aes(x = seats, y = log(selling_price))) + geom_point(size=2, shape=23) + geom_smooth(method = "lm", se=FALSE, color="red", formula = y ~ x, size=1,fullrange=TRUE) + stat_cor(method = "pearson")
p4 <- ggplot(df_model, aes(x = engine_cc, y = log(selling_price))) + geom_point(size=2, shape=23) + geom_smooth(method = "lm", se=FALSE, color="red", formula = y ~ x, size=1,fullrange=TRUE) + stat_cor(method = "pearson")
p5 <- ggplot(df_model, aes(x = max_power_bhp, y = log(selling_price))) + geom_point(size=2, shape=23) + geom_smooth(method = "lm", se=FALSE, color="red", formula = y ~ x, size=1,fullrange=TRUE) + stat_cor(method = "pearson")
p6 <- ggplot(df_model, aes(x = mileage_kmpl, y = log(selling_price))) + geom_point(size=2, shape=23) + geom_smooth(method = "lm", se=FALSE, color="red", formula = y ~ x, size=1,fullrange=TRUE) + stat_cor(method = "pearson")
grid.arrange(p1,p2,p3,p4,p5,p6,ncol = 2)
# grid.arrange(p3,p4,ncol = 2)
#
# grid.arrange(p5,p6,ncol = 2)
df_model[df_model$km_driven >= 1400000,]
## 75% of the sample size
smp_size <- floor(0.90 * nrow(df_model))
## set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(df_model)), size = smp_size)
train <- df_model[train_ind, ]
test <- df_model[-train_ind, ]
dim(train)
## [1] 6941 12
dim(test)
## [1] 772 12
library(nortest)
options(scipen = 999)
model0 <- lm(selling_price ~ .,data = train)
summary(model0)
##
## Call:
## lm(formula = selling_price ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3331487 -100659 -10525 84701 5618801
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 3061483.60573 86798.48811 35.271
## age -42063.61774 1397.05789 -30.109
## brandChevrolet -2737092.62121 42481.78140 -64.430
## brandDatsun -2771706.50901 55744.85491 -49.721
## brandFord -2671164.01425 38924.12799 -68.625
## brandHonda -2662256.49132 37799.99259 -70.430
## brandHyundai -2665701.48926 36798.52180 -72.440
## brandJaguar -1212964.80295 47790.23552 -25.381
## brandMahindra -2650663.29388 39631.00892 -66.884
## brandMaruti -2579102.22228 37717.30825 -68.380
## brandMercedes-Benz -1518746.65643 52397.13327 -28.985
## brandNissan -2690308.73525 49467.45167 -54.385
## brandRenault -2686283.05223 42479.20797 -63.238
## brandSkoda -2698990.66527 45596.99991 -59.192
## brandTata -2772964.46671 37964.59638 -73.041
## brandToyota -2339451.06306 39565.94516 -59.128
## brandVolkswagen -2747436.32080 41752.55927 -65.803
## brandVolvo -853489.66020 48455.86584 -17.614
## km_driven -0.92773 0.09071 -10.228
## fuelDiesel 138921.82456 46927.21497 2.960
## fuelLPG 142308.83427 71073.60365 2.002
## fuelPetrol 23954.84944 47134.03953 0.508
## seller_typeIndividual -53617.61075 12145.30067 -4.415
## seller_typeTrustmark Dealer -59969.67963 25052.77826 -2.394
## transmissionManual -91904.88281 15050.03862 -6.107
## ownerFourth & Above Owner -10240.87886 26713.29725 -0.383
## ownerSecond Owner -57342.34573 9361.62193 -6.125
## ownerTest Drive Car 636677.82502 214130.61187 2.973
## ownerThird Owner -33109.30678 16056.32241 -2.062
## seats 451.40330 6201.60825 0.073
## engine_cc 22.71074 19.44683 1.168
## max_power_bhp 6289.80492 213.68493 29.435
## mileage_kmpl -3892.63349 1657.66269 -2.348
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## age < 0.0000000000000002 ***
## brandChevrolet < 0.0000000000000002 ***
## brandDatsun < 0.0000000000000002 ***
## brandFord < 0.0000000000000002 ***
## brandHonda < 0.0000000000000002 ***
## brandHyundai < 0.0000000000000002 ***
## brandJaguar < 0.0000000000000002 ***
## brandMahindra < 0.0000000000000002 ***
## brandMaruti < 0.0000000000000002 ***
## brandMercedes-Benz < 0.0000000000000002 ***
## brandNissan < 0.0000000000000002 ***
## brandRenault < 0.0000000000000002 ***
## brandSkoda < 0.0000000000000002 ***
## brandTata < 0.0000000000000002 ***
## brandToyota < 0.0000000000000002 ***
## brandVolkswagen < 0.0000000000000002 ***
## brandVolvo < 0.0000000000000002 ***
## km_driven < 0.0000000000000002 ***
## fuelDiesel 0.00308 **
## fuelLPG 0.04529 *
## fuelPetrol 0.61131
## seller_typeIndividual 0.000010269919 ***
## seller_typeTrustmark Dealer 0.01670 *
## transmissionManual 0.000000001073 ***
## ownerFourth & Above Owner 0.70146
## ownerSecond Owner 0.000000000955 ***
## ownerTest Drive Car 0.00296 **
## ownerThird Owner 0.03924 *
## seats 0.94198
## engine_cc 0.24291
## max_power_bhp < 0.0000000000000002 ***
## mileage_kmpl 0.01889 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 301800 on 6908 degrees of freedom
## Multiple R-squared: 0.8325, Adjusted R-squared: 0.8317
## F-statistic: 1073 on 32 and 6908 DF, p-value: < 0.00000000000000022
par(mfrow = c(2,2))
plot(model0, which = 1:4)
ncvTest(model0)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 35038.85, Df = 1, p = < 0.000000000000000222
ad.test(resid(model0))
##
## Anderson-Darling normality test
##
## data: resid(model0)
## A = 473.93, p-value < 0.00000000000000022
vif(model0)
## GVIF Df GVIF^(1/(2*Df))
## age 2.210890 1 1.486906
## brand 9.569932 16 1.073133
## km_driven 1.620535 1 1.273002
## fuel 2.806896 3 1.187694
## seller_type 1.540598 2 1.114095
## transmission 1.841868 1 1.357154
## owner 1.429861 4 1.045711
## seats 2.687588 1 1.639386
## engine_cc 6.969510 1 2.639983
## max_power_bhp 3.915497 1 1.978761
## mileage_kmpl 3.392585 1 1.841897
pt <- powerTransform(cbind(df_model$age,df_model$km_driven,df_model$engine_cc,df_model$max_power_bhp,(df_model$mileage_kmpl + 1)) ~ 1)
summary(pt)
## bcPower Transformations to Multinormality
## Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## Y1 0.2483 0.25 0.2182 0.2784
## Y2 0.3107 0.31 0.2940 0.3273
## Y3 -0.7397 -0.74 -0.7894 -0.6900
## Y4 -0.4298 -0.43 -0.4708 -0.3887
## Y5 1.1027 1.10 1.0471 1.1582
##
## Likelihood ratio test that transformation parameters are equal to 0
## (all log transformations)
## LRT df pval
## LR test, lambda = (0 0 0 0 0) 6559.984 5 < 0.000000000000000222
##
## Likelihood ratio test that no transformations are needed
## LRT df pval
## LR test, lambda = (1 1 1 1 1) 15589.43 5 < 0.000000000000000222
model_tra <- lm(selling_price ~ I(train$age ^0.25) + I(train$km_driven ^0.31)+ I(train$engine_cc^-0.74) + I(train$max_power_bhp^-.43) + I(train$mileage_kmpl^1.10) + train$brand + train$fuel + train$owner + train$seller_type, data = train)
boxCox(model_tra, lambda=seq(-2, 2, by=0.5))
ad.test(resid(model_tra))
##
## Anderson-Darling normality test
##
## data: resid(model_tra)
## A = 486.09, p-value < 0.00000000000000022
ncvTest(model_tra)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 27863.8, Df = 1, p = < 0.000000000000000222
model_transf <- lm(log(selling_price) ~ I(train$age ^0.25) + I(train$km_driven ^0.31)+ I(train$engine_cc^-0.74) + I(train$max_power_bhp^-.43) + I(train$mileage_kmpl^1.10) + train$brand + train$fuel + train$owner + train$seller_type, data = train)
summary(model_transf)
##
## Call:
## lm(formula = log(selling_price) ~ I(train$age^0.25) + I(train$km_driven^0.31) +
## I(train$engine_cc^-0.74) + I(train$max_power_bhp^-0.43) +
## I(train$mileage_kmpl^1.1) + train$brand + train$fuel + train$owner +
## train$seller_type, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.68999 -0.15032 0.02182 0.16612 1.27555
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 18.9737871 0.0599518 316.484
## I(train$age^0.25) -1.7939047 0.0223920 -80.114
## I(train$km_driven^0.31) -0.0026144 0.0006590 -3.967
## I(train$engine_cc^-0.74) -84.2468820 8.2372023 -10.228
## I(train$max_power_bhp^-0.43) -12.6600487 0.3125336 -40.508
## I(train$mileage_kmpl^1.1) -0.0001428 0.0009052 -0.158
## train$brandChevrolet -1.1408342 0.0340716 -33.483
## train$brandDatsun -1.1415710 0.0461392 -24.742
## train$brandFord -0.9776257 0.0312521 -31.282
## train$brandHonda -0.8309816 0.0308041 -26.976
## train$brandHyundai -0.8654984 0.0292005 -29.640
## train$brandJaguar -0.0698798 0.0418248 -1.671
## train$brandMahindra -0.8795154 0.0305598 -28.780
## train$brandMaruti -0.7930701 0.0298014 -26.612
## train$brandMercedes-Benz -0.0381083 0.0459314 -0.830
## train$brandNissan -0.8912754 0.0413645 -21.547
## train$brandRenault -0.8740731 0.0342687 -25.506
## train$brandSkoda -0.8956728 0.0384576 -23.290
## train$brandTata -1.2590186 0.0301269 -41.791
## train$brandToyota -0.5033485 0.0314215 -16.019
## train$brandVolkswagen -0.9400152 0.0342168 -27.472
## train$brandVolvo -0.2294982 0.0423882 -5.414
## train$fuelDiesel 0.2594932 0.0411919 6.300
## train$fuelLPG 0.1683162 0.0623226 2.701
## train$fuelPetrol 0.0220839 0.0414781 0.532
## train$ownerFourth & Above Owner -0.2608520 0.0232368 -11.226
## train$ownerSecond Owner -0.0923323 0.0082316 -11.217
## train$ownerTest Drive Car 0.2222491 0.1877271 1.184
## train$ownerThird Owner -0.1769434 0.0140122 -12.628
## train$seller_typeIndividual -0.0439739 0.0105924 -4.151
## train$seller_typeTrustmark Dealer -0.0353604 0.0218303 -1.620
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## I(train$age^0.25) < 0.0000000000000002 ***
## I(train$km_driven^0.31) 0.000073402337 ***
## I(train$engine_cc^-0.74) < 0.0000000000000002 ***
## I(train$max_power_bhp^-0.43) < 0.0000000000000002 ***
## I(train$mileage_kmpl^1.1) 0.87468
## train$brandChevrolet < 0.0000000000000002 ***
## train$brandDatsun < 0.0000000000000002 ***
## train$brandFord < 0.0000000000000002 ***
## train$brandHonda < 0.0000000000000002 ***
## train$brandHyundai < 0.0000000000000002 ***
## train$brandJaguar 0.09481 .
## train$brandMahindra < 0.0000000000000002 ***
## train$brandMaruti < 0.0000000000000002 ***
## train$brandMercedes-Benz 0.40675
## train$brandNissan < 0.0000000000000002 ***
## train$brandRenault < 0.0000000000000002 ***
## train$brandSkoda < 0.0000000000000002 ***
## train$brandTata < 0.0000000000000002 ***
## train$brandToyota < 0.0000000000000002 ***
## train$brandVolkswagen < 0.0000000000000002 ***
## train$brandVolvo 0.000000063635 ***
## train$fuelDiesel 0.000000000317 ***
## train$fuelLPG 0.00694 **
## train$fuelPetrol 0.59445
## train$ownerFourth & Above Owner < 0.0000000000000002 ***
## train$ownerSecond Owner < 0.0000000000000002 ***
## train$ownerTest Drive Car 0.23650
## train$ownerThird Owner < 0.0000000000000002 ***
## train$seller_typeIndividual 0.000033433733 ***
## train$seller_typeTrustmark Dealer 0.10532
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2645 on 6910 degrees of freedom
## Multiple R-squared: 0.8919, Adjusted R-squared: 0.8914
## F-statistic: 1900 on 30 and 6910 DF, p-value: < 0.00000000000000022
par(mfrow = c(2,2))
plot(model_transf, which = 1:4)
ncvTest(model_transf)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 107.2202, Df = 1, p = < 0.000000000000000222
ad.test(resid(model_transf))
##
## Anderson-Darling normality test
##
## data: resid(model_transf)
## A = 24.528, p-value < 0.00000000000000022
vif(model_transf)
## GVIF Df GVIF^(1/(2*Df))
## I(train$age^0.25) 2.520715 1 1.587676
## I(train$km_driven^0.31) 2.213487 1 1.487779
## I(train$engine_cc^-0.74) 8.103191 1 2.846610
## I(train$max_power_bhp^-0.43) 4.486294 1 2.118087
## I(train$mileage_kmpl^1.1) 2.856638 1 1.690159
## train$brand 5.237449 16 1.053107
## train$fuel 3.153256 3 1.210951
## train$owner 1.414022 4 1.044256
## train$seller_type 1.487383 2 1.104347
model1 <- lm(log(selling_price) ~ .,data = train)
summary(model1)
##
## Call:
## lm(formula = log(selling_price) ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3324 -0.1422 0.0100 0.1589 1.7701
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 12.99812307225 0.06975240296 186.347
## age -0.11357743584 0.00112269403 -101.165
## brandChevrolet -0.88910954524 0.03413891646 -26.044
## brandDatsun -0.86895832328 0.04479729621 -19.398
## brandFord -0.62458897027 0.03127993954 -19.968
## brandHonda -0.47251440806 0.03037656959 -15.555
## brandHyundai -0.53439724103 0.02957177454 -18.071
## brandJaguar -0.09943444766 0.03840485978 -2.589
## brandMahindra -0.67859782519 0.03184799833 -21.307
## brandMaruti -0.48158143322 0.03031012338 -15.888
## brandMercedes-Benz -0.12091519804 0.04210702321 -2.872
## brandNissan -0.56763036587 0.03975269268 -14.279
## brandRenault -0.60800562600 0.03413684842 -17.811
## brandSkoda -0.56922536839 0.03664234689 -15.535
## brandTata -0.95627132244 0.03050884736 -31.344
## brandToyota -0.31663448806 0.03179571223 -9.958
## brandVolkswagen -0.59921359521 0.03355290400 -17.859
## brandVolvo -0.11653870399 0.03893976904 -2.993
## km_driven -0.00000068040 0.00000007289 -9.334
## fuelDiesel 0.31995684072 0.03771132516 8.484
## fuelLPG 0.17988260526 0.05711567966 3.149
## fuelPetrol 0.08489589000 0.03787753209 2.241
## seller_typeIndividual -0.03623107511 0.00976012284 -3.712
## seller_typeTrustmark Dealer -0.01163701178 0.02013274105 -0.578
## transmissionManual -0.06100781358 0.01209440834 -5.044
## ownerFourth & Above Owner -0.14471406887 0.02146715587 -6.741
## ownerSecond Owner -0.08229654556 0.00752312211 -10.939
## ownerTest Drive Car 0.32067409601 0.17207816691 1.864
## ownerThird Owner -0.12306518514 0.01290307118 -9.538
## seats 0.04059985615 0.00498369369 8.147
## engine_cc 0.00020696456 0.00001562773 13.243
## max_power_bhp 0.00839131801 0.00017172001 48.866
## mileage_kmpl 0.00050418568 0.00133211947 0.378
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## age < 0.0000000000000002 ***
## brandChevrolet < 0.0000000000000002 ***
## brandDatsun < 0.0000000000000002 ***
## brandFord < 0.0000000000000002 ***
## brandHonda < 0.0000000000000002 ***
## brandHyundai < 0.0000000000000002 ***
## brandJaguar 0.009643 **
## brandMahindra < 0.0000000000000002 ***
## brandMaruti < 0.0000000000000002 ***
## brandMercedes-Benz 0.004096 **
## brandNissan < 0.0000000000000002 ***
## brandRenault < 0.0000000000000002 ***
## brandSkoda < 0.0000000000000002 ***
## brandTata < 0.0000000000000002 ***
## brandToyota < 0.0000000000000002 ***
## brandVolkswagen < 0.0000000000000002 ***
## brandVolvo 0.002774 **
## km_driven < 0.0000000000000002 ***
## fuelDiesel < 0.0000000000000002 ***
## fuelLPG 0.001643 **
## fuelPetrol 0.025037 *
## seller_typeIndividual 0.000207 ***
## seller_typeTrustmark Dealer 0.563273
## transmissionManual 0.000000466795935124 ***
## ownerFourth & Above Owner 0.000000000016978633 ***
## ownerSecond Owner < 0.0000000000000002 ***
## ownerTest Drive Car 0.062429 .
## ownerThird Owner < 0.0000000000000002 ***
## seats 0.000000000000000441 ***
## engine_cc < 0.0000000000000002 ***
## max_power_bhp < 0.0000000000000002 ***
## mileage_kmpl 0.705083
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2425 on 6908 degrees of freedom
## Multiple R-squared: 0.9091, Adjusted R-squared: 0.9087
## F-statistic: 2160 on 32 and 6908 DF, p-value: < 0.00000000000000022
par(mfrow = c(2,2))
plot(model1, which = 1:4)
ncvTest(model1)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 319.8596, Df = 1, p = < 0.000000000000000222
ad.test(resid(model1))
##
## Anderson-Darling normality test
##
## data: resid(model1)
## A = 15.926, p-value < 0.00000000000000022
vif(model1)
## GVIF Df GVIF^(1/(2*Df))
## age 2.210890 1 1.486906
## brand 9.569932 16 1.073133
## km_driven 1.620535 1 1.273002
## fuel 2.806896 3 1.187694
## seller_type 1.540598 2 1.114095
## transmission 1.841868 1 1.357154
## owner 1.429861 4 1.045711
## seats 2.687588 1 1.639386
## engine_cc 6.969510 1 2.639983
## max_power_bhp 3.915497 1 1.978761
## mileage_kmpl 3.392585 1 1.841897
mod.0 <- lm(log(selling_price) ~ 1, data = train)
#step(mod.0, scope = list(lower = mod.0, upper = model0), direction = "forward")
n <- length(train$selling_price)
step(model0, scope = list(lower = mod.0, upper = model1), direction = 'backward',k = log(n), trace = 0)
##
## Call:
## lm(formula = selling_price ~ age + brand + km_driven + fuel +
## seller_type + transmission + owner + max_power_bhp + mileage_kmpl,
## data = train)
##
## Coefficients:
## (Intercept) age
## 3102019.2910 -42192.1610
## brandChevrolet brandDatsun
## -2733398.4212 -2765912.0782
## brandFord brandHonda
## -2666546.4486 -2657058.4051
## brandHyundai brandJaguar
## -2662992.1031 -1213703.0368
## brandMahindra brandMaruti
## -2638380.5967 -2575923.5655
## brandMercedes-Benz brandNissan
## -1515664.3280 -2685859.6358
## brandRenault brandSkoda
## -2683177.1011 -2694586.7050
## brandTata brandToyota
## -2768618.2629 -2325529.1950
## brandVolkswagen brandVolvo
## -2745160.0468 -854009.7271
## km_driven fuelDiesel
## -0.9175 140544.6275
## fuelLPG fuelPetrol
## 136194.9250 17761.8281
## seller_typeIndividual seller_typeTrustmark Dealer
## -53513.9336 -59434.7742
## transmissionManual ownerFourth & Above Owner
## -92733.5629 -10735.2800
## ownerSecond Owner ownerTest Drive Car
## -57459.7598 637694.1058
## ownerThird Owner max_power_bhp
## -33411.5063 6420.6067
## mileage_kmpl
## -4873.5412
n <- length(train$selling_price)
step(model0, scope = list(lower = mod.0, upper = model1), direction = 'forward',k = log(n), trace = 0)
##
## Call:
## lm(formula = selling_price ~ age + brand + km_driven + fuel +
## seller_type + transmission + owner + seats + engine_cc +
## max_power_bhp + mileage_kmpl, data = train)
##
## Coefficients:
## (Intercept) age
## 3061483.6057 -42063.6177
## brandChevrolet brandDatsun
## -2737092.6212 -2771706.5090
## brandFord brandHonda
## -2671164.0142 -2662256.4913
## brandHyundai brandJaguar
## -2665701.4893 -1212964.8030
## brandMahindra brandMaruti
## -2650663.2939 -2579102.2223
## brandMercedes-Benz brandNissan
## -1518746.6564 -2690308.7353
## brandRenault brandSkoda
## -2686283.0522 -2698990.6653
## brandTata brandToyota
## -2772964.4667 -2339451.0631
## brandVolkswagen brandVolvo
## -2747436.3208 -853489.6602
## km_driven fuelDiesel
## -0.9277 138921.8246
## fuelLPG fuelPetrol
## 142308.8343 23954.8494
## seller_typeIndividual seller_typeTrustmark Dealer
## -53617.6108 -59969.6796
## transmissionManual ownerFourth & Above Owner
## -91904.8828 -10240.8789
## ownerSecond Owner ownerTest Drive Car
## -57342.3457 636677.8250
## ownerThird Owner seats
## -33109.3068 451.4033
## engine_cc max_power_bhp
## 22.7107 6289.8049
## mileage_kmpl
## -3892.6335
model_back <- lm(log(selling_price) ~ age + brand + km_driven + fuel +
seller_type + transmission + owner + max_power_bhp + engine_cc + seats,
data = train)
model_forward <- lm(log(selling_price) ~ age + brand + km_driven + fuel +
seller_type + transmission + owner + seats + engine_cc +
max_power_bhp + mileage_kmpl,data = train)
anova(model_back,model_forward)
model_sub <- lm(log(selling_price) ~ age + brand + km_driven + fuel + seller_type +
transmission + owner + max_power_bhp + engine_cc + seats, data = train)
summary(model_sub)
##
## Call:
## lm(formula = log(selling_price) ~ age + brand + km_driven + fuel +
## seller_type + transmission + owner + max_power_bhp + engine_cc +
## seats, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.33270 -0.14294 0.01098 0.15852 1.77240
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 13.01536516209 0.05281790747 246.420
## age -0.11375660148 0.00101795252 -111.750
## brandChevrolet -0.88882464124 0.03412850026 -26.043
## brandDatsun -0.86793271727 0.04471249182 -19.411
## brandFord -0.62435269230 0.03127177028 -19.965
## brandHonda -0.47151209898 0.03025903746 -15.583
## brandHyundai -0.53402092293 0.02955322220 -18.070
## brandJaguar -0.10016839564 0.03835349275 -2.612
## brandMahindra -0.67866807750 0.03184548274 -21.311
## brandMaruti -0.48052820564 0.03018024011 -15.922
## brandMercedes-Benz -0.12153438725 0.04207262136 -2.889
## brandNissan -0.56745037321 0.03974738333 -14.276
## brandRenault -0.60723962439 0.03407468822 -17.821
## brandSkoda -0.56937580843 0.03663791913 -15.541
## brandTata -0.95628032895 0.03050694640 -31.346
## brandToyota -0.31637682809 0.03178645250 -9.953
## brandVolkswagen -0.59937749497 0.03354802914 -17.866
## brandVolvo -0.11715221646 0.03890360333 -3.011
## km_driven -0.00000068017 0.00000007289 -9.332
## fuelDiesel 0.32042092384 0.03768904883 8.502
## fuelLPG 0.17806342174 0.05690954868 3.129
## fuelPetrol 0.08316591712 0.03759840327 2.212
## seller_typeIndividual -0.03618362669 0.00975871256 -3.708
## seller_typeTrustmark Dealer -0.01148104238 0.02012727509 -0.570
## transmissionManual -0.06079112370 0.01208010077 -5.032
## ownerFourth & Above Owner -0.14489219553 0.02146066560 -6.752
## ownerSecond Owner -0.08236333284 0.00752058599 -10.952
## ownerTest Drive Car 0.32041910687 0.17206617855 1.862
## ownerThird Owner -0.12322823567 0.01289507796 -9.556
## max_power_bhp 0.00838467109 0.00017080902 49.088
## engine_cc 0.00020496403 0.00001470581 13.938
## seats 0.04007367034 0.00478555924 8.374
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## age < 0.0000000000000002 ***
## brandChevrolet < 0.0000000000000002 ***
## brandDatsun < 0.0000000000000002 ***
## brandFord < 0.0000000000000002 ***
## brandHonda < 0.0000000000000002 ***
## brandHyundai < 0.0000000000000002 ***
## brandJaguar 0.009028 **
## brandMahindra < 0.0000000000000002 ***
## brandMaruti < 0.0000000000000002 ***
## brandMercedes-Benz 0.003881 **
## brandNissan < 0.0000000000000002 ***
## brandRenault < 0.0000000000000002 ***
## brandSkoda < 0.0000000000000002 ***
## brandTata < 0.0000000000000002 ***
## brandToyota < 0.0000000000000002 ***
## brandVolkswagen < 0.0000000000000002 ***
## brandVolvo 0.002610 **
## km_driven < 0.0000000000000002 ***
## fuelDiesel < 0.0000000000000002 ***
## fuelLPG 0.001762 **
## fuelPetrol 0.027002 *
## seller_typeIndividual 0.000211 ***
## seller_typeTrustmark Dealer 0.568410
## transmissionManual 0.0000004967817 ***
## ownerFourth & Above Owner 0.0000000000158 ***
## ownerSecond Owner < 0.0000000000000002 ***
## ownerTest Drive Car 0.062619 .
## ownerThird Owner < 0.0000000000000002 ***
## max_power_bhp < 0.0000000000000002 ***
## engine_cc < 0.0000000000000002 ***
## seats < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2425 on 6909 degrees of freedom
## Multiple R-squared: 0.9091, Adjusted R-squared: 0.9087
## F-statistic: 2230 on 31 and 6909 DF, p-value: < 0.00000000000000022
par(mfrow = c(2,2))
plot(model_sub, which = 1:4)
ncvTest(model_sub)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 321.3591, Df = 1, p = < 0.000000000000000222
ad.test(resid(model_sub))
##
## Anderson-Darling normality test
##
## data: resid(model_sub)
## A = 15.954, p-value < 0.00000000000000022
vif(model_sub)
## GVIF Df GVIF^(1/(2*Df))
## age 1.817830 1 1.348269
## brand 8.501065 16 1.069168
## km_driven 1.620422 1 1.272958
## fuel 1.796055 3 1.102520
## seller_type 1.539902 2 1.113969
## transmission 1.837740 1 1.355633
## owner 1.427641 4 1.045508
## max_power_bhp 3.874543 1 1.968386
## engine_cc 6.172238 1 2.484399
## seats 2.478445 1 1.574308
p <- ncol(train) - 1
n <- nrow(train)
nyc.hats <- hatvalues(model_sub)
sum(nyc.hats)
## [1] 32
nyc.std <- rstandard(model_sub)
plot(hatvalues(model_sub), rstandard(model_sub),
xlab='Leverage', ylab='Standardized Residuals')
abline(v = 3*(p+1)/n , lty = 2, lwd = 2, col = "red")
abline(h = c(-2, 2), lty = 2, lwd = 2, col = "blue")
influenceIndexPlot(model_sub)
train$rownames <- rownames(train)
train_new <- train[!train$rownames %in% c(1811,8043),]
model3 <- lm(log(train_new$selling_price) ~ age + brand + km_driven + fuel + seller_type +
transmission + owner + max_power_bhp + engine_cc + seats ,data = train_new)
summary(model3)
##
## Call:
## lm(formula = log(train_new$selling_price) ~ age + brand + km_driven +
## fuel + seller_type + transmission + owner + max_power_bhp +
## engine_cc + seats, data = train_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.33035 -0.14322 0.00968 0.15902 1.19459
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 13.02233392528 0.05260083234 247.569
## age -0.11364691633 0.00102649202 -110.714
## brandChevrolet -0.88514134033 0.03397876466 -26.050
## brandDatsun -0.86559390742 0.04451407491 -19.445
## brandFord -0.62240499283 0.03113570004 -19.990
## brandHonda -0.46807927854 0.03013314627 -15.534
## brandHyundai -0.53138360875 0.02942702348 -18.058
## brandJaguar -0.10005489414 0.03817988155 -2.621
## brandMahindra -0.67668038378 0.03170265998 -21.345
## brandMaruti -0.47831172601 0.03004972112 -15.917
## brandMercedes-Benz -0.11995582609 0.04188307818 -2.864
## brandNissan -0.56515129771 0.03957374184 -14.281
## brandRenault -0.60428557207 0.03392945927 -17.810
## brandSkoda -0.56621590180 0.03648137001 -15.521
## brandTata -0.95324828295 0.03037560519 -31.382
## brandToyota -0.31134213248 0.03166041134 -9.834
## brandVolkswagen -0.59709057474 0.03340295261 -17.875
## brandVolvo -0.11911103542 0.03872947690 -3.075
## km_driven -0.00000079511 0.00000008001 -9.938
## fuelDiesel 0.32209747170 0.03751971291 8.585
## fuelLPG 0.17980083124 0.05665248347 3.174
## fuelPetrol 0.08136316798 0.03743320358 2.174
## seller_typeIndividual -0.03469555730 0.00972096195 -3.569
## seller_typeTrustmark Dealer -0.01227408108 0.02003644214 -0.613
## transmissionManual -0.05974762545 0.01202659808 -4.968
## ownerFourth & Above Owner -0.14123604346 0.02136898156 -6.609
## ownerSecond Owner -0.08110903269 0.00749472146 -10.822
## ownerTest Drive Car 0.31705973078 0.17128824151 1.851
## ownerThird Owner -0.11970570061 0.01284936147 -9.316
## max_power_bhp 0.00833691315 0.00017018996 48.986
## engine_cc 0.00021037632 0.00001466109 14.349
## seats 0.03839224170 0.00477306167 8.044
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## age < 0.0000000000000002 ***
## brandChevrolet < 0.0000000000000002 ***
## brandDatsun < 0.0000000000000002 ***
## brandFord < 0.0000000000000002 ***
## brandHonda < 0.0000000000000002 ***
## brandHyundai < 0.0000000000000002 ***
## brandJaguar 0.008796 **
## brandMahindra < 0.0000000000000002 ***
## brandMaruti < 0.0000000000000002 ***
## brandMercedes-Benz 0.004195 **
## brandNissan < 0.0000000000000002 ***
## brandRenault < 0.0000000000000002 ***
## brandSkoda < 0.0000000000000002 ***
## brandTata < 0.0000000000000002 ***
## brandToyota < 0.0000000000000002 ***
## brandVolkswagen < 0.0000000000000002 ***
## brandVolvo 0.002110 **
## km_driven < 0.0000000000000002 ***
## fuelDiesel < 0.0000000000000002 ***
## fuelLPG 0.001511 **
## fuelPetrol 0.029772 *
## seller_typeIndividual 0.000361 ***
## seller_typeTrustmark Dealer 0.540169
## transmissionManual 0.00000069288170521 ***
## ownerFourth & Above Owner 0.00000000004146622 ***
## ownerSecond Owner < 0.0000000000000002 ***
## ownerTest Drive Car 0.064208 .
## ownerThird Owner < 0.0000000000000002 ***
## max_power_bhp < 0.0000000000000002 ***
## engine_cc < 0.0000000000000002 ***
## seats 0.00000000000000102 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2414 on 6907 degrees of freedom
## Multiple R-squared: 0.91, Adjusted R-squared: 0.9096
## F-statistic: 2252 on 31 and 6907 DF, p-value: < 0.00000000000000022
par(mfrow = c(2,2))
plot(model3,1:4)
nrow(train) - nrow(train_new)
## [1] 2
influenceIndexPlot(model3)
ad.test(rstandard(model3))
##
## Anderson-Darling normality test
##
## data: rstandard(model3)
## A = 15.119, p-value < 0.00000000000000022
ncvTest(model3)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 284.682, Df = 1, p = < 0.000000000000000222
wts <- 1/fitted(lm(abs(residuals(model3)) ~ train_new$age + train_new$km_driven + train_new$fuel + train_new$brand +
train_new$seller_type + train_new$transmission + train_new$owner + train_new$engine_cc + train_new$max_power_bhp + train_new$seats ))^2
model4 <- lm(log(selling_price) ~age + brand + km_driven + fuel + seller_type +
transmission + owner + max_power_bhp + engine_cc + seats ,data = train_new, weights = wts)
summary(model4)
##
## Call:
## lm(formula = log(selling_price) ~ age + brand + km_driven + fuel +
## seller_type + transmission + owner + max_power_bhp + engine_cc +
## seats, data = train_new, weights = wts)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -8.9443 -0.7879 0.0879 0.8481 7.5929
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 12.86929111766 0.04328225451 297.334
## age -0.11144513709 0.00109492909 -101.783
## brandChevrolet -0.85693028700 0.03772147502 -22.717
## brandDatsun -0.80919189675 0.04302115385 -18.809
## brandFord -0.60942297344 0.03374022495 -18.062
## brandHonda -0.45467246751 0.03265215052 -13.925
## brandHyundai -0.51095508730 0.03220131693 -15.868
## brandJaguar -0.12579404392 0.03573367751 -3.520
## brandMahindra -0.67916694401 0.03415687225 -19.884
## brandMaruti -0.44511486583 0.03265517021 -13.631
## brandMercedes-Benz -0.13282606737 0.04645151648 -2.859
## brandNissan -0.53892802742 0.03923312231 -13.737
## brandRenault -0.56991191045 0.03525421407 -16.166
## brandSkoda -0.55213014465 0.03594755580 -15.359
## brandTata -0.91513532321 0.03427608577 -26.699
## brandToyota -0.35174919625 0.03404701190 -10.331
## brandVolkswagen -0.60108843041 0.03603536658 -16.681
## brandVolvo -0.16121133740 0.02863378818 -5.630
## km_driven -0.00000082688 0.00000008133 -10.167
## fuelDiesel 0.33156921750 0.02028136967 16.348
## fuelLPG 0.19733134490 0.04881256462 4.043
## fuelPetrol 0.08484892016 0.02028974721 4.182
## seller_typeIndividual -0.01412861895 0.00651577913 -2.168
## seller_typeTrustmark Dealer 0.00994122314 0.01397883190 0.711
## transmissionManual -0.05431771848 0.01004404071 -5.408
## ownerFourth & Above Owner -0.14475755425 0.02578514446 -5.614
## ownerSecond Owner -0.07051815776 0.00692428385 -10.184
## ownerTest Drive Car 0.30674188674 0.06071262280 5.052
## ownerThird Owner -0.11609232801 0.01413127795 -8.215
## max_power_bhp 0.00912165092 0.00014554820 62.671
## engine_cc 0.00024156756 0.00001330299 18.159
## seats 0.03382609170 0.00453509744 7.459
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## age < 0.0000000000000002 ***
## brandChevrolet < 0.0000000000000002 ***
## brandDatsun < 0.0000000000000002 ***
## brandFord < 0.0000000000000002 ***
## brandHonda < 0.0000000000000002 ***
## brandHyundai < 0.0000000000000002 ***
## brandJaguar 0.000434 ***
## brandMahindra < 0.0000000000000002 ***
## brandMaruti < 0.0000000000000002 ***
## brandMercedes-Benz 0.004256 **
## brandNissan < 0.0000000000000002 ***
## brandRenault < 0.0000000000000002 ***
## brandSkoda < 0.0000000000000002 ***
## brandTata < 0.0000000000000002 ***
## brandToyota < 0.0000000000000002 ***
## brandVolkswagen < 0.0000000000000002 ***
## brandVolvo 0.000000018716836051 ***
## km_driven < 0.0000000000000002 ***
## fuelDiesel < 0.0000000000000002 ***
## fuelLPG 0.000053428475903489 ***
## fuelPetrol 0.000029270716099603 ***
## seller_typeIndividual 0.030165 *
## seller_typeTrustmark Dealer 0.477007
## transmissionManual 0.000000065884386340 ***
## ownerFourth & Above Owner 0.000000020538966060 ***
## ownerSecond Owner < 0.0000000000000002 ***
## ownerTest Drive Car 0.000000447590480613 ***
## ownerThird Owner 0.000000000000000251 ***
## max_power_bhp < 0.0000000000000002 ***
## engine_cc < 0.0000000000000002 ***
## seats 0.000000000000098043 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.269 on 6907 degrees of freedom
## Multiple R-squared: 0.9678, Adjusted R-squared: 0.9676
## F-statistic: 6695 on 31 and 6907 DF, p-value: < 0.00000000000000022
par(mfrow = c(2,2))
plot(model4,1:4)
influenceIndexPlot(model4)
library(nortest)
ad.test(rstandard(model4))
##
## Anderson-Darling normality test
##
## data: rstandard(model4)
## A = 11.783, p-value < 0.00000000000000022
ncvTest(model4)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 3.826009, Df = 1, p = 0.050463
hist(resid(model4))
vif(model4)
## GVIF Df GVIF^(1/(2*Df))
## age 2.423701 1 1.556824
## brand 39.938858 16 1.122131
## km_driven 2.885549 1 1.698690
## fuel 2.372212 3 1.154850
## seller_type 3.184873 2 1.335897
## transmission 5.701131 1 2.387704
## owner 1.668984 4 1.066121
## max_power_bhp 8.033941 1 2.834421
## engine_cc 8.304141 1 2.881691
## seats 2.367013 1 1.538510
wts <- 1/fitted(lm(abs(residuals(model3)) ~ train_new$age + train_new$km_driven + train_new$fuel + train_new$brand +
train_new$seller_type + train_new$transmission + train_new$owner + train_new$max_power_bhp + train_new$seats ))^2
model5 <- lm(log(selling_price) ~age + brand + km_driven + fuel + seller_type +
transmission + owner + max_power_bhp + seats ,data = train_new, weights = wts)
summary(model4)
##
## Call:
## lm(formula = log(selling_price) ~ age + brand + km_driven + fuel +
## seller_type + transmission + owner + max_power_bhp + engine_cc +
## seats, data = train_new, weights = wts)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -8.9443 -0.7879 0.0879 0.8481 7.5929
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 12.86929111766 0.04328225451 297.334
## age -0.11144513709 0.00109492909 -101.783
## brandChevrolet -0.85693028700 0.03772147502 -22.717
## brandDatsun -0.80919189675 0.04302115385 -18.809
## brandFord -0.60942297344 0.03374022495 -18.062
## brandHonda -0.45467246751 0.03265215052 -13.925
## brandHyundai -0.51095508730 0.03220131693 -15.868
## brandJaguar -0.12579404392 0.03573367751 -3.520
## brandMahindra -0.67916694401 0.03415687225 -19.884
## brandMaruti -0.44511486583 0.03265517021 -13.631
## brandMercedes-Benz -0.13282606737 0.04645151648 -2.859
## brandNissan -0.53892802742 0.03923312231 -13.737
## brandRenault -0.56991191045 0.03525421407 -16.166
## brandSkoda -0.55213014465 0.03594755580 -15.359
## brandTata -0.91513532321 0.03427608577 -26.699
## brandToyota -0.35174919625 0.03404701190 -10.331
## brandVolkswagen -0.60108843041 0.03603536658 -16.681
## brandVolvo -0.16121133740 0.02863378818 -5.630
## km_driven -0.00000082688 0.00000008133 -10.167
## fuelDiesel 0.33156921750 0.02028136967 16.348
## fuelLPG 0.19733134490 0.04881256462 4.043
## fuelPetrol 0.08484892016 0.02028974721 4.182
## seller_typeIndividual -0.01412861895 0.00651577913 -2.168
## seller_typeTrustmark Dealer 0.00994122314 0.01397883190 0.711
## transmissionManual -0.05431771848 0.01004404071 -5.408
## ownerFourth & Above Owner -0.14475755425 0.02578514446 -5.614
## ownerSecond Owner -0.07051815776 0.00692428385 -10.184
## ownerTest Drive Car 0.30674188674 0.06071262280 5.052
## ownerThird Owner -0.11609232801 0.01413127795 -8.215
## max_power_bhp 0.00912165092 0.00014554820 62.671
## engine_cc 0.00024156756 0.00001330299 18.159
## seats 0.03382609170 0.00453509744 7.459
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## age < 0.0000000000000002 ***
## brandChevrolet < 0.0000000000000002 ***
## brandDatsun < 0.0000000000000002 ***
## brandFord < 0.0000000000000002 ***
## brandHonda < 0.0000000000000002 ***
## brandHyundai < 0.0000000000000002 ***
## brandJaguar 0.000434 ***
## brandMahindra < 0.0000000000000002 ***
## brandMaruti < 0.0000000000000002 ***
## brandMercedes-Benz 0.004256 **
## brandNissan < 0.0000000000000002 ***
## brandRenault < 0.0000000000000002 ***
## brandSkoda < 0.0000000000000002 ***
## brandTata < 0.0000000000000002 ***
## brandToyota < 0.0000000000000002 ***
## brandVolkswagen < 0.0000000000000002 ***
## brandVolvo 0.000000018716836051 ***
## km_driven < 0.0000000000000002 ***
## fuelDiesel < 0.0000000000000002 ***
## fuelLPG 0.000053428475903489 ***
## fuelPetrol 0.000029270716099603 ***
## seller_typeIndividual 0.030165 *
## seller_typeTrustmark Dealer 0.477007
## transmissionManual 0.000000065884386340 ***
## ownerFourth & Above Owner 0.000000020538966060 ***
## ownerSecond Owner < 0.0000000000000002 ***
## ownerTest Drive Car 0.000000447590480613 ***
## ownerThird Owner 0.000000000000000251 ***
## max_power_bhp < 0.0000000000000002 ***
## engine_cc < 0.0000000000000002 ***
## seats 0.000000000000098043 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.269 on 6907 degrees of freedom
## Multiple R-squared: 0.9678, Adjusted R-squared: 0.9676
## F-statistic: 6695 on 31 and 6907 DF, p-value: < 0.00000000000000022
par(mfrow = c(2,2))
plot(model4,1:4)
influenceIndexPlot(model5)
ad.test(rstandard(model5))
##
## Anderson-Darling normality test
##
## data: rstandard(model5)
## A = 8.4198, p-value < 0.00000000000000022
ncvTest(model5)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 3.135075, Df = 1, p = 0.076625
hist(resid(model5))
vif(model5)
## GVIF Df GVIF^(1/(2*Df))
## age 2.506426 1 1.583170
## brand 36.812509 16 1.119276
## km_driven 3.288264 1 1.813357
## fuel 2.268820 3 1.146305
## seller_type 3.855163 2 1.401234
## transmission 6.989814 1 2.643826
## owner 1.730053 4 1.070921
## max_power_bhp 6.703728 1 2.589156
## seats 2.001459 1 1.414729
train_new['predict_selling_price'] <- exp(predict(model5,train_new))
train_new['percent_diff'] <- abs((train_new['selling_price'] - train_new['predict_selling_price'] )/train_new['selling_price']) * 100
ggplot(train_new, aes(x = selling_price, y = predict_selling_price)) + geom_point(size=2, shape=23) + geom_smooth(method = "lm", se=FALSE, color="red", formula = y ~ x, size=1,fullrange=TRUE) + stat_cor(method = "pearson") + scale_y_continuous(labels = scales::comma) + scale_x_continuous(labels = scales::comma) + ggtitle("Train Actual Predicted Correlation")
test['predict_selling_price'] <- exp(predict(model5,test))
test['percent_diff'] <- abs((test['selling_price'] - test['predict_selling_price'] )/test['selling_price']) * 100
ggplot(test, aes(x = selling_price, y = predict_selling_price)) + geom_point(size=2, shape=23) + geom_smooth(method = "lm", se=FALSE, color="red", formula = y ~ x, size=1,fullrange=TRUE) + stat_cor(method = "pearson") + scale_y_continuous(labels = scales::comma) + scale_x_continuous(labels = scales::comma) + ggtitle("Test Actual Predicted Correlation")
hist((train$km_driven))
boxplot(train$km_driven)
summary(train$km_driven)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 35000 60000 69294 96443 1500000
library(caret)
## Loading required package: lattice
var_imp <- varImp(model4)
ggplot(var_imp, aes(x= Overall, y=reorder(rownames(var_imp),Overall), fill=rownames(var_imp) )) +
geom_bar(stat="identity")+theme_minimal() + guides(fill=FALSE) + labs(title=paste0("Variable Importance Plot"),y= "features") + xlab("score")